Vector DB Jupyter Notebook
Download Notebook

Setup

In [4]:
Code
# !pip install pinecone-client
import os
import pinecone

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_API_ENV"),
)

Vector Database

In [47]:
Code
# pinecone.create_index("helloworld-index", dimension=4)
active_indexes = pinecone.list_indexes()
index_description = pinecone.describe_index("helloworld-index")
print(f'{active_indexes}: {index_description}')
['helloworld-index']: IndexDescription(name='helloworld-index', metric='cosine', replicas=1, dimension=4.0, shards=1, pods=1, pod_type='p1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')
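Index creation is asynchronous, so a freshly created index may not be queryable right away. A minimal sketch of polling for readiness, using the status field visible in the description above:

import time

# Poll until the index reports ready (status shown in the description above)
while not pinecone.describe_index("helloworld-index").status['ready']:
    time.sleep(1)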

Inserting Data

In [48]:
Code
index = pinecone.Index("helloworld-index")

upsert_response = index.upsert(
    vectors=[
        (
         "vec1",                # Vector ID 
         [0.1, 0.2, 0.3, 0.4],  # Dense vector values
         {"genre": "drama"}     # Vector metadata
        ),
        (
         "vec2", 
         [0.2, 0.3, 0.4, 0.5], 
         {"genre": "action"}
        )
    ],
    namespace="helloworld-namespace"
)

Fetching Data

In [49]:
Code
fetch_response = index.fetch(ids=["vec1", "vec2"], namespace="helloworld-namespace")
fetch_response['vectors']['vec2']['values']
[0.2, 0.3, 0.4, 0.5]
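Fetching by ID covers the R in CRUD; the index can also be searched by similarity. A minimal sketch with the same v2 client (the query vector here is arbitrary and equals vec2, so vec2 should come back as the top match):

query_response = index.query(
    vector=[0.2, 0.3, 0.4, 0.5],   # query vector, same dimension as the index
    top_k=1,                       # number of nearest neighbors to return
    include_values=True,
    include_metadata=True,
    namespace="helloworld-namespace",
)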

Misinformation Meetup

Text Data

In [6]:
Code
with open('../data/LibriSpeech/misinfo_chatGPT.txt', 'r') as f:
    contents = f.read()
print(contents[:100])
Well, thank you very much, Dr. Ahn. This is a real pleasure to be able to speak across the ocean. I 

Token Count

https://platform.openai.com/docs/guides/embeddings/what-are-embeddings

Regular Expressions

In [7]:
Code
import re

def estimate_tokens(text):
    # Rough estimate: count each word, plus each remaining non-whitespace character
    tokens = re.findall(r'\b\w+\b|\S', text)
    return len(tokens)

print(f"Regex-estimated token count: {estimate_tokens(contents)}")
Regex-estimated token count: 7974

tiktoken

In [8]:
Code
import tiktoken

def num_tokens_from_string(string: str) -> int:
    """Returns the number of tokens in a text string."""    
    encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode(string))
    return num_tokens

# num_tokens_from_string("Hello World!",)
num_tokens=num_tokens_from_string(contents)
num_tokens
7852
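For intuition about what tiktoken counts, the encoding round-trips between text and integer token IDs — a quick sketch:

encoding = tiktoken.get_encoding("cl100k_base")
tokens = encoding.encode("Hello World!")
print(tokens)                   # integer token IDs
print(encoding.decode(tokens))  # back to the original string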
In [9]:
Code
# 출처: https://github.com/OpsConfig/OpenAI_Lab/blob/3a8c55160a6790fc790ef1c2c797d83c716eee94/Context-based-search-Version2.ipynb
# Based on https://openai.com/api/pricing/ on 01/29/2023
# If you were using this for approximating pricing with Azure OpenAI adjust the values below with: https://azure.microsoft.com/pricing/details/cognitive-services/openai-service/

#MODEL  USAGE
#Ada     v1 $0.0040 / 1K tokens
#Babbage v1 $0.0050 / 1K tokens
#Curie   v1 $0.0200 / 1K tokens
#Davinci v1 $0.2000 / 1K tokens

#MODEL  USAGE
#Ada     v2 $0.0004 / 1K tokens
#This Ada model, text-embedding-ada-002, is a better and lower cost replacement for our older embedding models. 

n_tokens_sum = num_tokens

ada_v1_embeddings_cost = (n_tokens_sum/1000) *.0040
babbage_v1_embeddings_cost = (n_tokens_sum/1000) *.0050
curie_v1_embeddings_cost = (n_tokens_sum/1000) *.02
davinci_v1_embeddings_cost = (n_tokens_sum/1000) *.2

ada_v2_embeddings_cost = (n_tokens_sum/1000) *.0004

print("Number of tokens: " + str(n_tokens_sum) + "\n")

print("MODEL        VERSION    COST")
print("-----------------------------------")
print("Ada" + "\t\t" + "v1" + "\t$" + '%.8s' % str(ada_v1_embeddings_cost))
print("Babbage" + "\t\t" + "v1" + "\t$" + '%.8s' % str(babbage_v1_embeddings_cost))
print("Curie" + "\t\t" + "v1" + "\t$" + '%.8s' % str(curie_v1_embeddings_cost))
print("Davinci" + "\t\t" + "v1" + "\t$" + '%.8s' % str(davinci_v1_embeddings_cost))
print("Ada" + "\t\t" + "v2" + "\t$" + '%.8s' %str(ada_v2_embeddings_cost))
Number of tokens: 7852

MODEL        VERSION    COST
-----------------------------------
Ada     v1  $0.031408
Babbage     v1  $0.03926
Curie       v1  $0.15704
Davinci     v1  $1.570400
Ada     v2  $0.003140

Embeddings

In [10]:
Code
import openai

openai.api_key = os.getenv('ENV_OPENAI_API_KEY')

def generate_embeddings(text, model="text-embedding-ada-002"):
    return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']

contents_embedding = generate_embeddings(contents)  # one API call for the entire transcript
len(contents_embedding)

Splitting Text

In [12]:
Code
import pandas as pd

sentences = contents.split(". ")

df = pd.DataFrame(sentences, columns=['text'])

print(df.head())
                                                text
0                      Well, thank you very much, Dr
1                                                Ahn
2  This is a real pleasure to be able to speak ac...
3  I wish I was there in person, but I did get th...
4                               So thank you so much
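As the first rows show, splitting on ". " cuts abbreviations such as "Dr." into fragments. A slightly more careful split is sketched below; the lookbehind list of abbreviations is an illustrative assumption, not exhaustive:

import re

# Split on sentence-ending punctuation followed by whitespace,
# except after a few common abbreviations (assumed list)
sentence_pattern = re.compile(r'(?<!\bDr)(?<!\bMr)(?<!\bMs)[.!?]\s+')
sentences = [s.strip() for s in sentence_pattern.split(contents) if s.strip()]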

Embeddings

In [14]:
Code
import uuid

def get_embedding(text: str, model="text-embedding-ada-002") -> list[float]:
    return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"]

# embedding = get_embedding("Your text goes here", model="text-embedding-ada-002")
# print(len(embedding))


# One embedding API call per sentence (several hundred calls for this transcript)
df["embedding"] = df.text.apply(lambda x: get_embedding(x))
df['vector_id'] = [str(uuid.uuid4()) for _ in range(len(df))]

df.to_csv("misinfo-embeddings.csv")
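One caveat for later reuse: to_csv writes each embedding list as a string, so reading the file back requires parsing that column — a minimal sketch assuming the same file and column names:

import ast
import pandas as pd

df = pd.read_csv("misinfo-embeddings.csv", index_col=0)
# Each embedding was serialized as a string like "[0.01, -0.02, ...]"
df["embedding"] = df["embedding"].apply(ast.literal_eval)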

Vector Database

In [15]:
Code
# Pick a name for the new index
index_name = 'misinfo'

# Check whether the index with the same name already exists - if so, delete it
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)
    
# Creates new index
pinecone.create_index(name=index_name, dimension=len(df['embedding'][0]))
index = pinecone.Index(index_name=index_name)

# Confirm our index was created
pinecone.list_indexes()
['misinfo']

Inserting Embeddings into the DB

https://towardsdatascience.com/crud-with-pinecone-ee6b6f8b54e8

In [16]:
Code
import itertools

def chunks(iterable, batch_size=100):
    """Yield successive batch_size-sized tuples from iterable."""
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))

for batch in chunks([(str(t), v) for t, v in zip(df.vector_id, df.embedding)]):
    index.upsert(vectors=batch, namespace="misinfo_namespace")

index.describe_index_stats()    
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'misinfo_namespace': {'vector_count': 368}},
 'total_vector_count': 368}
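For completeness against the CRUD article linked above, deletion goes through the same client — a sketch (the ID is illustrative; the commented line clears the whole namespace):

# Delete a single vector by ID
index.delete(ids=[df.vector_id[0]], namespace="misinfo_namespace")

# Or remove everything in the namespace
# index.delete(delete_all=True, namespace="misinfo_namespace")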

Search

In [17]:
Code
pd.set_option('display.max_colwidth', None)

from openai.embeddings_utils import cosine_similarity

def search_docs(df, user_query, top_n=3):
    # Embed the query, score every sentence by cosine similarity, return the top matches
    embedding = get_embedding(user_query, model="text-embedding-ada-002")

    df_similarities = df.copy()
    df_similarities["similarities"] = df_similarities.embedding.apply(
        lambda x: cosine_similarity(x, embedding)
    )

    return df_similarities.sort_values("similarities", ascending=False).head(top_n)

question = "why misinformation is dangerous?\n\n"

res = search_docs(df, question, top_n=3)
res.text
186    And we know that, you know, our surgeon general and other major leaders around the world have recognized the ways in which misinformation can affect our health, and they can affect the health of democracies
130                                                                                                                                                                And bullshit is a part of the misinformation story
16                                                                                                                We've studied misinformation during the pandemic and during elections and all sorts of other topics
Name: text, dtype: object
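The same search can also run against the Pinecone index populated above, rather than the local DataFrame — a sketch using the v2 client's query call and mapping the returned vector IDs back to the text column:

query_embedding = get_embedding(question, model="text-embedding-ada-002")

query_response = index.query(
    vector=query_embedding,
    top_k=3,
    namespace="misinfo_namespace",
)

# Map returned vector IDs back to the original sentences
for match in query_response['matches']:
    text = df.loc[df.vector_id == match['id'], 'text'].iloc[0]
    print(f"{match['score']:.3f}  {text}")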

Visualization

Code
import numpy as np
import nomic
from nomic import atlas

nomic.login(os.getenv('NOMIC_API_KEY'))

# Stack the per-row embedding lists into a single (n, 1536) array;
# a raw df['embedding'].to_numpy() would yield a 1-D object array
project = atlas.map_embeddings(
    embeddings=np.array(df['embedding'].tolist())
)